#Loading libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(sf)
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
library(maps)
## 
## Attaching package: 'maps'
## 
## The following object is masked from 'package:purrr':
## 
##     map
library(broom)
# Loading data
# The raw file appears to use "-" as its missing-value marker (e.g. every
# visible `Proj Time` value is "-"). Without declaring it, readr falls back to
# character for split columns such as 10K/15K/20K and reports parsing problems
# for the columns it guessed as times. Declaring "-" as NA lets those columns
# parse as times and records the gaps as proper NAs.
# NOTE(review): confirm "-" is never a legitimate value in any text column.
# NOTE(review): the absolute path below is machine-specific; it is left
# unchanged here, but a project-relative path would make the script portable.
marathon_results <- read_csv(
  "/Users/prathyushabhuma/Documents/Florida Polytechnic University/Data Visualization and Reproducible Research/dataviz_final_project/data/marathon_results_2017.csv",
  na = c("", "NA", "-")
)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 26410 Columns: 22
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (10): Bib, Name, M/F, City, State, Country, 10K, 15K, 20K, Proj Time
## dbl   (4): Age, Overall, Gender, Division
## time  (8): 5K, Half, 25K, 30K, 35K, 40K, Pace, Official Time
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Dimensions of the dataset (number of rows, number of columns)
dim(marathon_results)
## [1] 26410    22
# Preview the first rows of the dataset
head(marathon_results)
# Display the structure of the dataset: each column's type and leading values
str(marathon_results)
## spc_tbl_ [26,410 × 22] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Bib          : chr [1:26410] "11" "17" "23" "21" ...
##  $ Name         : chr [1:26410] "Kirui, Geoffrey" "Rupp, Galen" "Osako, Suguru" "Biwott, Shadrack" ...
##  $ Age          : num [1:26410] 24 30 25 32 31 40 33 28 27 28 ...
##  $ M/F          : chr [1:26410] "M" "M" "M" "M" ...
##  $ City         : chr [1:26410] "Keringet" "Portland" "Machida-City" "Mammoth Lakes" ...
##  $ State        : chr [1:26410] NA "OR" NA "CA" ...
##  $ Country      : chr [1:26410] "KEN" "USA" "JPN" "USA" ...
##  $ 5K           : 'hms' num [1:26410] 00:15:25 00:15:24 00:15:25 00:15:25 ...
##   ..- attr(*, "units")= chr "secs"
##  $ 10K          : chr [1:26410] "0:30:28" "0:30:27" "0:30:29" "0:30:29" ...
##  $ 15K          : chr [1:26410] "0:45:44" "0:45:44" "0:45:44" "0:45:44" ...
##  $ 20K          : chr [1:26410] "1:01:15" "1:01:15" "1:01:16" "1:01:19" ...
##  $ Half         : 'hms' num [1:26410] 01:04:35 01:04:35 01:04:36 01:04:45 ...
##   ..- attr(*, "units")= chr "secs"
##  $ 25K          : 'hms' num [1:26410] 01:16:59 01:16:59 01:17:00 01:17:00 ...
##   ..- attr(*, "units")= chr "secs"
##  $ 30K          : 'hms' num [1:26410] 01:33:01 01:33:01 01:33:01 01:33:01 ...
##   ..- attr(*, "units")= chr "secs"
##  $ 35K          : 'hms' num [1:26410] 01:48:19 01:48:19 01:48:31 01:48:58 ...
##   ..- attr(*, "units")= chr "secs"
##  $ 40K          : 'hms' num [1:26410] 02:02:53 02:03:14 02:03:38 02:04:35 ...
##   ..- attr(*, "units")= chr "secs"
##  $ Pace         : 'hms' num [1:26410] 00:04:57 00:04:58 00:04:59 00:05:03 ...
##   ..- attr(*, "units")= chr "secs"
##  $ Proj Time    : chr [1:26410] "-" "-" "-" "-" ...
##  $ Official Time: 'hms' num [1:26410] 02:09:37 02:09:58 02:10:28 02:12:08 ...
##   ..- attr(*, "units")= chr "secs"
##  $ Overall      : num [1:26410] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Gender       : num [1:26410] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Division     : num [1:26410] 1 2 3 4 5 1 6 7 8 9 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Bib = col_character(),
##   ..   Name = col_character(),
##   ..   Age = col_double(),
##   ..   `M/F` = col_character(),
##   ..   City = col_character(),
##   ..   State = col_character(),
##   ..   Country = col_character(),
##   ..   `5K` = col_time(format = ""),
##   ..   `10K` = col_character(),
##   ..   `15K` = col_character(),
##   ..   `20K` = col_character(),
##   ..   Half = col_time(format = ""),
##   ..   `25K` = col_time(format = ""),
##   ..   `30K` = col_time(format = ""),
##   ..   `35K` = col_time(format = ""),
##   ..   `40K` = col_time(format = ""),
##   ..   Pace = col_time(format = ""),
##   ..   `Proj Time` = col_character(),
##   ..   `Official Time` = col_time(format = ""),
##   ..   Overall = col_double(),
##   ..   Gender = col_double(),
##   ..   Division = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Display a per-column summary of the dataset
summary(marathon_results)
##      Bib                Name                Age            M/F           
##  Length:26410       Length:26410       Min.   :18.00   Length:26410      
##  Class :character   Class :character   1st Qu.:34.00   Class :character  
##  Mode  :character   Mode  :character   Median :43.00   Mode  :character  
##                                        Mean   :42.59                     
##                                        3rd Qu.:51.00                     
##                                        Max.   :84.00                     
##      City              State             Country               5K          
##  Length:26410       Length:26410       Length:26410       Length:26410     
##  Class :character   Class :character   Class :character   Class1:hms       
##  Mode  :character   Mode  :character   Mode  :character   Class2:difftime  
##                                                           Mode  :numeric   
##                                                                            
##                                                                            
##      10K                15K                20K                Half         
##  Length:26410       Length:26410       Length:26410       Length:26410     
##  Class :character   Class :character   Class :character   Class1:hms       
##  Mode  :character   Mode  :character   Mode  :character   Class2:difftime  
##                                                           Mode  :numeric   
##                                                                            
##                                                                            
##      25K               30K               35K               40K          
##  Length:26410      Length:26410      Length:26410      Length:26410     
##  Class1:hms        Class1:hms        Class1:hms        Class1:hms       
##  Class2:difftime   Class2:difftime   Class2:difftime   Class2:difftime  
##  Mode  :numeric    Mode  :numeric    Mode  :numeric    Mode  :numeric   
##                                                                         
##                                                                         
##      Pace           Proj Time         Official Time        Overall     
##  Length:26410      Length:26410       Length:26410      Min.   :    1  
##  Class1:hms        Class :character   Class1:hms        1st Qu.: 6604  
##  Class2:difftime   Mode  :character   Class2:difftime   Median :13206  
##  Mode  :numeric                       Mode  :numeric    Mean   :13206  
##                                                         3rd Qu.:19809  
##                                                         Max.   :26411  
##      Gender         Division   
##  Min.   :    1   Min.   :   1  
##  1st Qu.: 3302   1st Qu.: 502  
##  Median : 6604   Median :1154  
##  Mean   : 6661   Mean   :1589  
##  3rd Qu.: 9905   3rd Qu.:2191  
##  Max.   :14438   Max.   :5846
# Count the missing values (NA) in each column
colSums(is.na(marathon_results))
##           Bib          Name           Age           M/F          City 
##             0             0             0             0             0 
##         State       Country            5K           10K           15K 
##          3595             0            25             0             0 
##           20K          Half           25K           30K           35K 
##             0            17            40            25            23 
##           40K          Pace     Proj Time Official Time       Overall 
##             6             0             0             0             0 
##        Gender      Division 
##             0             0
# 1. Interactive Plot
# Histogram of official finish times. `Official Time` is held in seconds, so a
# binwidth of 300 groups runners into 5-minute bins.
finish_time_plot <- ggplot(
  data = marathon_results,
  mapping = aes(x = `Official Time`)
) +
  geom_histogram(color = "white", fill = "blue", binwidth = 300) +
  labs(
    title = "Distribution of Marathon Finish Times",
    x = "Finish Time",
    y = "Count"
  )

finish_time_plot

# Wrap the static ggplot in a plotly widget so it can be explored interactively
interactive_finish_time_plot <- ggplotly(p = finish_time_plot)
interactive_finish_time_plot
# Write the interactive widget out as an HTML file
htmlwidgets::saveWidget(
  widget = interactive_finish_time_plot,
  file = "interactive_finish_time_plot.html"
)

# 2. Map of marathon participants by country

# Load world shapefile from Natural Earth
# https://www.naturalearthdata.com/downloads/110m-cultural-vectors/
# NOTE(review): absolute, machine-specific path; it will not resolve on another computer.
world_shapes <- read_sf("/Users/prathyushabhuma/Documents/Florida Polytechnic University/Data Visualization and Reproducible Research/MiniProject2/Data/ne_110m_admin_0_countries")
head(world_shapes)
# Create a map of participants' origins
# Count the number of participants per value of `Country`
country_count <- marathon_results %>%
  group_by(Country) %>%
  summarize(users = n())

# Add a `join_key` column to the shapes: SU_A3 where ISO_A3_EH is "NOR",
# ISO_A3_EH otherwise.
# NOTE(review): this column is not used by the join below, which matches on
# SU_A3 instead; confirm which key was intended.
world_shapes <- world_shapes %>%
  mutate(join_key = if_else(ISO_A3_EH == "NOR", SU_A3, ISO_A3_EH))

# Add a `join_key` column to the counts.
# NOTE(review): both branches of this if_else() yield the value of `Country`,
# so `join_key` is simply a copy of `Country`.
country_counts <- country_count %>%
  mutate(join_key = if_else(Country == "NOR", "NOR", Country))

# Attach the participant counts to the country shapes; shapes without a
# matching code keep users = NA.
# NOTE(review): assumes the results' country codes follow the same 3-letter
# scheme as SU_A3; codes that do not match any shape are left out of the map. Verify.
map_data <- left_join(world_shapes, country_counts, by = c("SU_A3" = "join_key"))

# Plot the map with ggplot2's geom_sf(), filling each country by its
# participant count; countries without a count use the na.value colour
ggplot(data = map_data) +
  geom_sf(mapping = aes(fill = users)) +
  scale_fill_gradient(
    name = "Participants",
    low = "lightgreen",
    high = "orange",
    na.value = "lightgray"
  ) +
  labs(title = "Marathon Participants by Country") +
  theme_minimal() +
  theme(
    # Title
    plot.title = element_text(size = 18, face = "bold"),
    # Legend placement, text, and key dimensions
    legend.position = "bottom",
    legend.title = element_text(size = 12, face = "bold"),
    legend.text = element_text(size = 10),
    legend.key.size = unit(2, "cm"),
    legend.key.height = unit(1, "cm"),
    legend.key.width = unit(1, "cm"),
    # Strip grid lines and axis decorations, which carry no meaning on a map
    panel.grid = element_blank(),
    axis.text = element_blank(),
    axis.title = element_blank()
  )

## Linear model predicting finish time based on age and gender
# Derive the finish time in minutes (the underlying value is in seconds) and
# encode gender as a factor, both in a single mutate() call
marathon_results <- marathon_results %>%
  mutate(
    Official_Time_Minutes = as.numeric(hms::as_hms(`Official Time`)) / 60,
    `M/F` = as.factor(`M/F`)
  )

# Regress finish time (minutes) on age and gender
lm_model <- lm(Official_Time_Minutes ~ Age + `M/F`, data = marathon_results)

# Print the coefficient table, residual summary, and fit statistics
summary(lm_model)
## 
## Call:
## lm(formula = Official_Time_Minutes ~ Age + `M/F`, data = marathon_results)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -104.085  -27.548   -9.507   18.620  245.891 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 206.61783    0.93404  221.21   <2e-16 ***
## Age           1.06309    0.02159   49.23   <2e-16 ***
## `M/F`M      -25.30338    0.49531  -51.09   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 39.18 on 26407 degrees of freedom
## Multiple R-squared:  0.1361, Adjusted R-squared:  0.136 
## F-statistic:  2080 on 2 and 26407 DF,  p-value: < 2.2e-16
# Plot model coefficients: one bar per model term, with error bars spanning
# the estimate plus/minus one standard error
coef_plot <- ggplot(
  data = tidy(lm_model),
  mapping = aes(
    x = term,
    y = estimate,
    ymin = estimate - std.error,
    ymax = estimate + std.error
  )
) +
  geom_col(fill = "skyblue", width = 0.5) +
  geom_errorbar(color = "black", width = 0.2) +
  labs(title = "Linear Model Coefficients", x = "Term", y = "Estimate") +
  theme_minimal()

# Render the plot
print(coef_plot)

# Count participants in each `M/F` category
table(marathon_results$`M/F`)
## 
##     F     M 
## 11972 14438
# Extension to the original mini project that was submitted earlier based on the feedback provided

# Displaying the summary of the linear model again (the same lm_model fitted
# earlier) as the starting point for the diagnostics that follow
summary(lm_model)
## 
## Call:
## lm(formula = Official_Time_Minutes ~ Age + `M/F`, data = marathon_results)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -104.085  -27.548   -9.507   18.620  245.891 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 206.61783    0.93404  221.21   <2e-16 ***
## Age           1.06309    0.02159   49.23   <2e-16 ***
## `M/F`M      -25.30338    0.49531  -51.09   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 39.18 on 26407 degrees of freedom
## Multiple R-squared:  0.1361, Adjusted R-squared:  0.136 
## F-statistic:  2080 on 2 and 26407 DF,  p-value: < 2.2e-16
# Plotting diagnostic plots to check the assumptions of the linear model
# par() returns the settings it replaces, so keep them and restore exactly
# those afterwards instead of assuming the previous layout was 1x1.
old_par <- par(mfrow = c(2, 2))  # Arrange plots in a 2x2 grid
plot(lm_model)

par(old_par)  # Restore the previous plot layout

# Visualizing actual vs. predicted values
# Store the model's fitted finish time (minutes) for every runner
marathon_results$predicted_time <- predict(lm_model, newdata = marathon_results)

ggplot(marathon_results, aes(x = Age, y = Official_Time_Minutes, color = `M/F`)) +
  geom_point(alpha = 0.5) +
  # Setting a constant `color` on this layer drops the inherited colour
  # mapping, and with it the implicit per-gender grouping. Without an explicit
  # `group`, both genders' predictions are joined into a single zig-zag path
  # that jumps between the two fitted lines at every age.
  geom_line(
    aes(y = predicted_time, group = `M/F`),
    color = "black",
    linetype = "dashed"
  ) +
  labs(title = "Actual vs. Predicted Finishing Time by Age and Gender",
       x = "Age",
       y = "Finishing Time (Minutes)") +
  scale_color_manual(values = c("M" = "blue", "F" = "pink")) +
  theme_minimal()

# Comment: The scatter plot shows the relationship between official finishing
# times (in minutes) and the ages of marathon participants, with runners
# coloured by gender: blue dots represent male finishers and pink dots
# represent female finishers. The black dashed overlay shows the linear
# regression model’s predicted finishing times based on age and gender. The
# plot reveals that finishing times tend to increase with age and that, on
# average, males have faster finishing times than females. The spread of the
# data indicates considerable variability in finishing times across ages and
# genders, highlighting the complex factors influencing marathon performance.